%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from numpy.random import default_rng
from sklearn.gaussian_process.kernels import RBF
import pickle
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import functions as fu
import warnings
import plotly.graph_objects as go
from tqdm import tqdm
import kernel_approximation as ka
import kernel_machine_learning as kml
import stochastic_plots as stoch
Consider a real-valued random variable whose pdf is $f(x)$. The corresponding cdf is $$ F(x) = \int_{-\infty}^{x} dx' f(x'). $$ The inverse of the cdf, $F^{-1}(p)$, can be used to generate random numbers from this distribution using the following algorithm:
For instance, the pdf of an exponential distribution defined on the set of non-negative real numbers is $$ f(x) = \lambda e^{-\lambda x}, x \ge 0, $$ with $\lambda > 0$.
The corresponding cdf is $$ F(x) = \int_{0}^{x} dx' \lambda e^{-\lambda x'} = 1 - e^{-\lambda x}, \ x \ge 0. $$ The inverse of the cdf is such that $ F^{-1}(p) = x$. Since $p = F(x) = 1 -e^{-\lambda x}$, $x = - \log (1-p) / \lambda$. Therefore, $$ F^{-1}(p) = -\frac{1}{\lambda} \log (1-p), \ 0 \le p \le 1. $$ See, for instance, [ https://en.wikipedia.org/wiki/Exponential_distribution ]
# Generate samples from the exponential distribution.
# Exponential distribution
def exp_pdf(x, gamma):
    """Pdf of an exponential distribution with scale parameter ``gamma``."""
    decay = np.exp(-x / gamma)
    return decay / gamma
def exp_cdf(x, lambd):
    """Cdf of an exponential distribution with scale parameter ``lambd``.

    Bug fix: the original body referenced the module-level ``gamma``
    instead of the ``lambd`` parameter, so the argument was silently
    ignored. ``lambd`` is the *scale* (mean), matching the convention of
    ``exp_pdf`` and ``exp_inverse_cdf``.
    """
    return 1.0 - np.exp(-x / lambd)
def exp_inverse_cdf(p, gamma):
    """Exponential quantile function: F^{-1}(p) = -gamma * log(1 - p)."""
    return gamma * (-np.log(1.0 - p))
# Inverse transform sampling for the exponential distribution:
# draw U ~ U[0, 1] and push it through the inverse cdf.
gamma = 2.0
n_samples = 10000
rng = default_rng(12345)  # fixed seed for reproducibility
U = rng.random(n_samples)  # U ~ U[0, 1]
X = exp_inverse_cdf(U, gamma)
# Plot the empirical histogram of the samples against the exact pdf.
fontsize = 14
fig, ax = stoch.plot_pdf(
    X,
    lambda x: exp_pdf(x, gamma),
    fontsize=fontsize,
    fig_num=1,
)
_ = ax.set_xlabel('$x$', fontsize=fontsize)
_ = ax.set_ylabel('pdf($x$)', fontsize=fontsize)
The exponential kernel in 1 dimension is $$ k(x, x') = k(x - x') = \exp\left(- \frac{\left|x - x' \right|}{\gamma} \right). $$
The density is proportional to the the inverse Fourier transform of $k(x)$ $$ \hbox{pdf}(w) = \frac{1}{2 \pi} \int_{-\infty}^{\infty} e^{-i x w} k(x) dx $$
Answer.-
Since we already know the expression of the required pdf and we know the expression of the kernel, we can use the kernel expression in the pdf: \begin{align*} pdf(w) = \frac{1}{2\pi} \int_{-\infty}^{\infty} e^{-i x w} k(x) dx = \frac{1}{2\pi} \int_{-\infty}^{\infty} e^{-i x w} e^{- \frac{|x|}{\gamma}} dx \end{align*} To get rid of the absolute value, we can split the integral between $\mathbb R^+$ and $\mathbb R^-$. \begin{align*} \frac{1}{2\pi} \int_{-\infty}^{\infty} e^{-i x w} e^{- \frac{|x|}{\gamma}} dx & = \frac{1}{2\pi}\left( \int_{-\infty}^{0} e^{-i x w} e^{ \frac{x}{\gamma}} dx + \int_{0}^{+\infty} e^{-i x w} e^{ \frac{-x}{\gamma}}dx \right)\\ & \stackrel{(1)}{=} \frac{1}{2\pi}\left( \int_{-\infty}^{0} e^{ x \big( -iw + \frac{1}{ \gamma } \big) } dx + \int_{0}^{+\infty} e^{ x \big( -iw - \frac{1}{ \gamma } \big) } dx\right)\\ & = \frac{1}{2\pi}\left( \left[\frac{ e^{-iwx} e^{\frac{x}{\gamma}} }{-iw +\frac{1}{\gamma}}\right]_{-\infty}^{0} + \left[\frac{ e^{-iwx} e^{\frac{-x}{\gamma}} }{-iw -\frac{1}{\gamma}}\right]_{0}^{+\infty}\right) \end{align*} where, in $(1)$, we have used that the integral of the exponential is straightforward. Now, we have to recall that $$ e^{-iwx} = \cos(-wx) + i \sin(-wx), $$ and, thus, this term is bounded. 
Using this, we obtain that the limit of the terms containing exponentials is zero, that is: \begin{align*} \frac{1}{2\pi}\left( \left[\frac{ e^{-iwx} e^{\frac{x}{\gamma}} }{-iw +\frac{1}{\gamma}}\right]_{-\infty}^{0} + \left[\frac{ e^{-iwx} e^{\frac{-x}{\gamma}} }{-iw -\frac{1}{\gamma}}\right]_{0}^{+\infty}\right) & = \frac{1}{2\pi}\left( \left[\frac{1}{-iw + \frac{1}{\gamma}} - \lim_{x \to -\infty} \frac{ e^{-iwx} e^{\frac{x}{\gamma}} }{-iw + \frac{1}{\gamma}}\right] + \left[\frac{1}{-iw - \frac{1}{\gamma}} - \lim_{x \to +\infty} \frac{ e^{-iwx} e^{\frac{-x}{\gamma}} }{-iw - \frac{1}{\gamma}}\right]\right) \\ & = \frac{1}{2\pi} \left( \frac{1}{-iw + \frac{1}{\gamma}} - \frac{1}{-iw - \frac{1}{\gamma}} \right) \\ & = \frac{1}{2\pi} \left( \frac{(-iw - \frac{1}{\gamma}) - (-iw + \frac{1}{\gamma})}{(-iw + \frac{1}{\gamma})(-1)(iw + \frac{1}{\gamma})} \right) \\ & = \frac{1}{\pi} \left( \frac{-\frac{1}{\gamma}}{-w^2 - \frac{1}{\gamma^2} } \right)\\ & = \frac{1}{\pi} \frac{\gamma}{1 + w^2 \gamma^2}. \end{align*}
Until now, we have obtained that $$ pdf(w) = \frac{1}{\pi} \frac{\gamma}{1 + w^2 \gamma^2}. $$ Now, if we rename $\gamma = \frac{1}{\gamma}$ which we can do, since we have no restrictions for gamma, we obtain that the $pdf(w)$ has the form: $$ pdf(w) = \frac{1}{\pi}\frac{1}{\gamma \left(1 + \left( \frac{w}{\gamma}\right)^2\right)}, $$ which is the form of the pdf of a Cauchy distribution. Using this, we can be sure that this pdf is normalized and this is the pdf that we were searching for.
Recall that the cauchy distribution has two parameters, that is, $f(x; x_0, \gamma_c)$. In our case, $x_0 = 0$ and $\gamma_c = \frac{1}{\gamma}$.
The corresponding cumulative distribution function is: $$ \hbox{cdf}(w) = \int_{-\infty}^w \hbox{pdf}(w') dw' $$
Answer.-
The answer to this question is immediate since the CDF of a Cauchy distribution is defined. If we use our $x_0$ and $\gamma_c$, the expression is: $$ F\left(x; 0,\frac{1}{\gamma}\right)=\frac{1}{\pi} \arctan\left(\gamma x\right)+\frac{1}{2} $$
Now, we have to give an expression for the inverse of the CDF $F$. This is straightforward since we know that the quantile, if the function $F$ is continuous and strictly monotonically increasing (as in this case, since it is a CDF), is the inverse of the CDF. Hence, $$ F^{-1}\left(p;0, \frac{1}{\gamma}\right) = \frac{1}{\gamma} \,\tan\left[\pi \left(p-{\frac {1}{2}}\right)\right] $$
Although we gave the definition of the pdf that matches the definition of the Cauchy distribution, we will use the first pdf expression found: $$ pdf(w) = \frac{1}{\pi} \frac{\gamma}{1 + w^2 \gamma^2}. $$
def cauchy_pdf(x, gamma):
    """Pdf of a Cauchy distribution with location 0 and scale 1/gamma."""
    u = x * gamma
    return (1 / np.pi) * (gamma / (1 + u ** 2))
def cauchy_cdf(x, gamma):
    """Cdf of a Cauchy distribution with location 0 and scale 1/gamma."""
    angle = np.arctan(gamma * x)
    return 0.5 + angle * (1 / np.pi)
def cauchy_inverse_cdf(p, gamma):
    """Quantile function of a Cauchy distribution with scale 1/gamma."""
    angle = np.pi * (p - 0.5)
    return (1 / gamma) * np.tan(angle)
# Inverse transform sampling for the Cauchy distribution
# (the spectral density of the 1-D exponential kernel).
gamma = 2.0
n_samples = 10000
rng = np.random.default_rng(12345)
U = rng.random(n_samples)  # U ~ U[0, 1]
X = cauchy_inverse_cdf(U, gamma)
# Compare the histogram of the samples with the exact Cauchy pdf.
fontsize = 14
fig, ax = stoch.plot_pdf(
    X,
    lambda x: cauchy_pdf(x, gamma),
    fontsize=fontsize,
    fig_num=1,
)
_ = ax.set_xlabel('$x$', fontsize=fontsize)
_ = ax.set_ylabel('pdf($x$)', fontsize=fontsize)
We see that the plot is really sparse. This, however, makes sense since the tangent function goes to $+\infty$ when the argument tends to $\pi/2$ and goes to $-\infty$ when the argument tends to $-\pi/2$, so we have many points that are outliers. A possible solution could be to clip the values to an interval.
# The Cauchy distribution is heavy-tailed, so a few extreme samples
# dominate the plot range; keep only samples with |x| < thresh.
thresh = 10
X_threshed = X[np.abs(X) < thresh]
fontsize = 14
fig, ax = stoch.plot_pdf(
    X_threshed,
    lambda x: cauchy_pdf(x, gamma),
    fontsize=fontsize,
    fig_num=1,
)
_ = ax.set_xlabel('$x$', fontsize=fontsize)
_ = ax.set_ylabel('pdf($x$)', fontsize=fontsize)
Consider the exponential kernel in $D$ dimensions $$ k(\mathbf{x}, \mathbf{x}') = k(\mathbf{x}- \mathbf{x}') = \exp\left\{- \frac{\lVert \mathbf{x}- \mathbf{x}' \rVert_1}{\gamma} \right\},$$ with the $L_1$ norm $$ \lVert \mathbf{x}- \mathbf{x}' \rVert_1 = \sum_{d=1}^D \left|x_d - x_d' \right|.$$
The Fourier transform of the exponential kernel in $D$ dimensions is proportional to a pdf: $$ \text{pdf}(\mathbf{w}) \propto \int_{\mathbb{R}^D} e^{-i \mathbf{w}^T \mathbf{x}} \exp\left\{- \frac{\lVert \mathbf{x} \rVert_1}{\gamma} \right\} d \mathbf{x}. $$
The inverse Fourier transform of a translationally invariante kernel $$ k(\mathbf{x}, \mathbf{x}') = k(\mathbf{x} - \mathbf{x}'). $$ is $$ \text{pdf}(\mathbf{w}) = \frac{1}{\left(2 \pi\right)^D} \int_{\mathbb{R}^D} e^{- i \mathbf{w}^T \mathbf{x}} k(\mathbf{x}) d\mathbf{x} $$
Answer.-
Firstly, recall that since $\mathbf{w,x} \in \mathbb R^D$, $\mathbf{w}^T \mathbf{x} = \sum_{i=1}^D w_ix_i$.
We have been told that the F.T. is proportional to a p.d.f.
Let us expand the RHS of the expression to which the pdf is proportional:
\begin{align*} \int_{\mathbb{R}^D} e^{-i \mathbf{w}^T \mathbf{x}} \exp\left\{- \frac{\lVert \mathbf{x} \rVert_1}{\gamma} \right\} d \mathbf{x} & = \int_{\mathbb{R}} \cdots \int_{\mathbb{R}} \exp \left\{-i \sum_{i=1}^D x_i w_i\right\} \exp \left\{ \frac{-1}{\gamma} \sum_{i=1}^D |x_i| \right\} dx_1 \cdots dx_D \\ & = \int_{\mathbb{R}} \cdots \int_{\mathbb{R}} \left( \exp \left \{ -i w_1 x_1 - \frac{|x_1|}{\gamma} \right\} \cdots \exp \left \{ -i w_D x_D - \frac{|x_D|}{\gamma} \right\} \right) dx_1 \cdots dx_D \end{align*}It can be appretiated that each integral only depends on one of the terms, so we can express that integral as the product of the integrals. \begin{align*} \int_{\mathbb{R}} \cdots \int_{\mathbb{R}} \left( \exp \left \{ -i w_1 x_1 - \frac{|x_1|}{\gamma} \right\} \cdots \exp \left \{ -i w_D x_D - \frac{|x_D|}{\gamma} \right\} \right) dx_1 \cdots dx_D & = \prod_{i=1}^D \int_{\mathbb R} \exp \left \{ -i w_D x_D - \frac{|x_D|}{\gamma} \right\} dx_i \end{align*} Lastly, we only have to see that each of the integrals is the one that we did previously in dimension $1$ in the question Q1. Hence, we obtain: \begin{align*} \int_{\mathbb{R}^D} e^{-i \mathbf{w}^T \mathbf{x}} \exp\left\{- \frac{\lVert \mathbf{x} \rVert_1}{\gamma} \right\} d \mathbf{x} = \prod_{i=1}^D pdf(w_i) \end{align*} Clearly, due to the normalization of the single pdfs, this is normalized and we obtain that the proportional expression is, in fact, an equality: $$ \text{pdf}(\mathbf{w}) = \prod_{i=1}^D \text{pdf}(w_i) $$
Hint: If the 1D implementation is vectorized, the answer to this question is straightforward.
Our previous implementation was vectorized, so the only difference is to create the samples from the uniform with the shape $(n\_samples,D)$. To be able to plot as we did before, we keep the first dimension and plot the histogram and pdf for this dimension. We decide to apply the threshold as we did before, since we found that the same outlier problem was occurring.
# Inverse transform sampling of the D-dimensional spectral density.
# The 1-D sampler is vectorized, so drawing an (n_samples, D) uniform
# matrix and mapping it through the inverse cdf is all that is needed.
gamma = 2.0
n_samples = 10000
D = 100
rng = np.random.default_rng(12345)
U = rng.random((n_samples, D))
X = cauchy_inverse_cdf(U, gamma)
# Clip outliers of the first coordinate before plotting.
# Fix: use `thresh` instead of the hard-coded literal 10 so that the
# variable actually controls the clipping threshold.
thresh = 10
X0_threshed = X[:, 0][np.abs(X[:, 0]) < thresh]
fontsize = 14
fig, ax = stoch.plot_pdf(
    X0_threshed,
    lambda x: cauchy_pdf(x, gamma),
    fontsize=fontsize,
    fig_num=1,
)
_ = ax.set_xlabel('$x$', fontsize=fontsize)
_ = ax.set_ylabel('pdf($x$)', fontsize=fontsize)
# Keep only the first two coordinates and drop any row where either
# coordinate is an outlier (|value| >= thresh).
X0, X1 = X[:, 0], X[:, 1]
thresh = 5
# Vectorized boolean mask replaces the original Python-level list
# comprehension (same rows kept, built in a single NumPy pass).
mask = (np.abs(X0) < thresh) & (np.abs(X1) < thresh)
X_threshed = np.column_stack((X0[mask], X1[mask]))
def bin_centers(bins):
    """Return the midpoints of consecutive bin edges.

    Generalization: the original assumed uniformly spaced edges (it added
    half of the first bin's width to every edge); averaging consecutive
    edges gives identical results for uniform bins and is also correct
    for non-uniform edges.
    """
    return (bins[:-1] + bins[1:]) / 2
fig = plt.figure()
# 2-D histogram of the two clipped coordinates.
h = np.histogram2d(X_threshed[:, 0], X_threshed[:, 1], bins=50)
# NOTE(review): h[0] / len(h[0]) divides the counts by the number of
# bins (50), not by the number of samples or the bin area, so the
# surface is only proportional to the joint density — confirm this
# scaling is intended.
df = pd.DataFrame(h[0] / len(h[0]), index=bin_centers(h[1]), columns=bin_centers(h[2]))
fig = go.Figure(data=[go.Surface(z=df)])
_ = fig.show()
<Figure size 432x288 with 0 Axes>
Complete the code for the class
class RandomFeaturesSamplerExp(RandomFeaturesSampler):
in the file
kernel_approximation.py
Answer.-
After implementing the code needed in the RandomFeaturesSamplerExp class, we will work through a small example of how this class approximates the kernel.
Firstly, we generate data using the datasets from sklearn. We also plot it.
## Generate data
# Generate a 2-D "curve" toy dataset (1000 points) and visualize it.
X, t = ka.generate_curve_dataset(1000)
ka.plot_curve_dataset(X, t)
After implementing the class in kernel_approximation.py, we plot the approximation for a range of different number of sampled features. We compare it with the original kernel using the function demo_kernel_approximation_features.
# Compare the exact exponential kernel with its random-feature
# approximation for an increasing number of sampled features.
n_features = np.array([10**i for i in range(1, 5)])  # 10, 100, 1000, 10000
length_scale_kernel = 2.0
RFExpSamplers = [ka.RandomFeaturesSamplerExp(n_features_sampled=n_feats,
                                             length_scale=length_scale_kernel)
                 for n_feats in n_features]
ka.demo_kernel_approximation_features(
    X=X,
    kernel=kml.exponential_kernel,
    features_samplers=RFExpSamplers)
We know that using random features, we are approximating the kernel the following way: \begin{align*} k(\mathbf{x} - \mathbf{x'}) & = \int_{\mathbb R^D} p(\mathbf{w})\left(cos(\mathbf{w}^t \mathbf{x}) \ \ sin(\mathbf{w}^t \mathbf{x})\right) \begin{pmatrix} cos(\mathbf{w}^t \mathbf{x'}) \\ sin( \mathbf{w}^t \mathbf{x'})\end{pmatrix} d \mathbf{w} \\ & \approx \sum_{j = 1}^J \left(cos(\mathbf{w}^t \mathbf{x}) \ \ sin(\mathbf{w}^t \mathbf{x})\right) \begin{pmatrix} cos(\mathbf{w}^t \mathbf{x'}) \\ sin( \mathbf{w}^t \mathbf{x'})\end{pmatrix} \end{align*} which is a Monte Carlo approximation. We know that the Monte Carlo approximation has an error $O\left(1/\sqrt{n}\right)$. Let us draw the approximation error with a wider range of sampled features, along with the function $f(n) = \frac{1}{\sqrt{n}}$ for each number of features to see if our approximation error converges to this function.
# Approximation error as a function of the number of random features,
# compared against the O(1/sqrt(n)) Monte Carlo convergence rate.
n_features = np.linspace(10, 10000, 40, dtype=int)
length_scale_kernel = 2.0
RFExpSamplers = [ka.RandomFeaturesSamplerExp(n_features_sampled=n_feats,
                                             length_scale=length_scale_kernel)
                 for n_feats in n_features]
ka.plot_kernel_approximation_error(
    X=X,
    kernel=kml.exponential_kernel,
    features_samplers=RFExpSamplers,
    n_features=n_features,
    error_function=lambda x: 1.0 / np.sqrt(x))
Consider the translationally invariant kernel $k\left(\mathbf{x}, \mathbf{x}'\right) = k\left(\mathbf{x} - \mathbf{x}'\right)$, with $\mathbf{x}, \mathbf{x}' \in \mathbb{R}^D$.
Acording to Bochner's theorem, this type of kernel can be expressed as the Fourier transform of a (possibly not normalized) density $p(\mathbf{w})$ $$ k\left(\mathbf{x} - \mathbf{x}'\right) = \int_{-\infty}^{\infty} d\mathbf{w} p\left(\mathbf{w}\right) e^{i \mathbf{w}^T \left(\mathbf{x} - \mathbf{x}'\right)}. $$ Since the kernel funtion is real and symmetric, this expression can be written as $$ k\left(\mathbf{x} - \mathbf{x}'\right) = \int_{-\infty}^{\infty} d\mathbf{w} p\left(\mathbf{w}\right) \cos \left(\mathbf{w}^T \left(\mathbf{x} - \mathbf{x}'\right) \right). $$
Show that the set of random features $$ \left\{ \left( \cos \mathbf{w}^T \mathbf{x}, \sin \mathbf{w}^T \mathbf{x} \right); \mathbf{w} \sim p(\mathbf{w})\right\}. $$ and the set $$ \left\{\sqrt{2} \cos \left(\mathbf{w}^T \mathbf{x} + b \right); \ \mathbf{w} \sim p(\mathbf{w}); \ b \sim U[0 , 2 \pi]; \ \mathbf{w} \perp b\right\} $$ provide the same approximation of the kernel.
Answer.-
In the previous approximation using sines and cosines, the approximation ends expressing the kernel as the product of feature mappings $\left(\hat{\phi_J(\mathbf{x})}\right)^T \hat{\phi_J(\mathbf{x'})}$
For this case, consider the following feature mapping: $$ \phi(\mathbf{x}) = \sqrt{2}\cos(\mathbf{w}^T \mathbf{x} + b) $$ We would like to see that, if we take the expectation in both $\mathbf{w}$ and $b$, we obtain the same approximation to the kernel. That is, in virtue of Bochner's theorem: $$ k(\mathbf{x}-\mathbf{x'}) = \mathbb{E}_{\mathbf{w},b}\left[ \phi(\mathbf{x})^T \phi(\mathbf{x'})\right] $$ Recall that the domain of $\mathbf{w}$ is $\mathbb R^D$ and the domain of $b$ is $[0,2\pi]$, but we will omit the integration limits for convenience. Let us see this: \begin{align*} \mathbb{E}_{\mathbf{w},b}\left[ \phi(\mathbf{x})^T \phi(\mathbf{x'})\right] & = \int_{\mathbf{w}} \text{pdf}(\mathbf w)\int_{b} \text{pdf}(b) \left(\sqrt{2}\cos(\mathbf{w}^T \mathbf{x} + b) \sqrt{2}\cos(\mathbf{w}^T \mathbf{x'} + b)\right) db \ d\mathbf{w}\\ & \stackrel{(1)}{=} \frac{1}{2\pi}\int_{\mathbf{w}}\text{pdf}(\mathbf w)\int_{b} \left(\cos(\mathbf{w}^T(\mathbf{x}-\mathbf{x'})) + \cos(\mathbf{w}^T(\mathbf{x+x'}) + 2b)\right) db \ d\mathbf{w}\\ & \stackrel{(2)}{=} \frac{1}{2\pi}\int_{\mathbf{w}}\text{pdf}(\mathbf w)\cos(\mathbf{w}^T(\mathbf{x}-\mathbf{x'}))\int_{b} db \ d\mathbf{w}\\ & = \int_{\mathbf{w}}\text{pdf}(\mathbf w)\cos(\mathbf{w}^T(\mathbf{x}-\mathbf{x'})) \ d\mathbf{w}\\ & = k(\mathbf{x-x'}) \end{align*}
Where, in $(1)$ we have used the product-to-sum identity $$ \cos(a)\cos(b) = \frac{1}{2}\left[\cos(a-b) + \cos(a+b)\right], $$ together with the fact that $b \sim U[0,2\pi]$ has $\text{pdf}(b) = \frac{1}{2\pi}$, and, in $(2)$ we have used that the integral $$ \int_0^{2\pi} \cos(a + 2x) dx = 0. $$
Hence, we have obtained that both sets of random features produce the same approximation of the kernel.
Fill in the corresponding code in the file
kernel_approximation.py
Answer.-
To code this, we have to simply provide an expression for the random features and its normalization factor.
We draw `self._n_random_samples_w` samples of $b$ from the $U(0,2\pi)$ distribution and, then, obtain the features using the previously computed $\mathbf{w}$ and the data. We have already plotted the approximations to the kernel given by the sin+cos implementation. However, it is interesting to plot it again right above the approximations given by the cos implementation, just to compare them.
# Approximate the exponential kernel using Random Fourier Features,
# comparing the 'sin+cos' and 'cos' feature constructions.
length_scale = 2.0
A = 1.0
n_features = np.array([10**i for i in range(1, 5)])
# NOTE(review): this wrapper is defined but never used below — the demo
# calls pass kml.exponential_kernel directly, so A and length_scale are
# not forwarded; confirm whether `kernel` should be passed instead.
def kernel(X, Y):
    return kml.exponential_kernel(X, Y, A=1.0, l=length_scale)
n_features = np.array([10**i for i in range(1, 5)])
length_scale_kernel = 2.0
# One sampler per feature count, for each sampling method.
RFExpSamplersSinCos = [ka.RandomFeaturesSamplerExp(
    n_features_sampled=n_feats,
    sampling_method='sin+cos',
    length_scale=length_scale)
    for n_feats in n_features]
RFExpSamplersCos = [ka.RandomFeaturesSamplerExp(
    n_features_sampled=n_feats,
    sampling_method='cos',
    length_scale=length_scale)
    for n_feats in n_features]
print("Sin + Cos Sampler")
ka.demo_kernel_approximation_features(
    X=X,
    kernel=kml.exponential_kernel,
    features_samplers=RFExpSamplersSinCos)
print("Cos Sampler")
ka.demo_kernel_approximation_features(
    X=X,
    kernel=kml.exponential_kernel,
    features_samplers=RFExpSamplersCos)
Sin + Cos Sampler
Cos Sampler
As we can see, in both cases the original exact kernel is approximated very accurately, decreasing the mean and max error when the number of features increases. Furthermore, the mean errors do not differ in more than $0.05$ in any number of features.
Solve the classification problem using different kernel methods using the same data in
To get more stable results, the process should be repeated for $10$ different random train / test partitions of the data.
Using 5-fold cross validation on the training data, determine the optimal values of the hyperparameter for the following cases:
Determine the hyperparameter grid on which the search is made using references from the literature. Include those references in the Chicago citation format ( https://www.chicagomanualofstyle.org/tools_citationguide/citation-guide-2.html).
It may be useful to vary one of the hyperparameters while keeping the other fixed (include those plots in your report). For instance, for $n_{features}$.
The first step will be to load the data. This data is a dataset for Optical Recognition of Handwritten Digits, which can be found at the UCI or in this link. It can be straightly loaded from sklearn.
We will have to run the GridSearch in $10$ different train/test partitions for all models. We select to use $80\%$ of the data for the train set and the rest for the test set.
Determining a good set of hyperparameters in which we will be perform the GridSearch cross validation is very important since it will directly affect the final results of our models. There are many techniques and algorithms that compute certain features of our problem that help us to seek in the best direction of the parameters ((Claesen, Marc, en Bart De Moor, 2015)).
At first, we explored a few algorithms that tried to improve the hyperparameters iteratively, but we discarded them since we had no time to implement them. Then, we explored random search (Bergstra, James, and Yoshua Bengio, 2012). This is not the most used technique for hyperparameter optimization, although it seemed very interesting. It is also implemented in sklearn in the module RandomizedSearchCV. For this technique, we do not explore the hyperparameter space by creating a grid with prefixed values (as we do in GridSearchCV), but we sample hyperparameter values from the hyperparameter distribution. This option of searching is usually much more computationally efficient when low resources are available. However, we have to assume a distribution for each of the parameters, which we thought to be too optimistic.
In the end, we decided to perform our good old GridSearchCV. We dug into several papers found on Google Scholar, and we found a paper (Chen, Kai, Rongchun Li, Yong Dou, Zhengfa Liang, 2017) that reviews SVMs with Kernel Approximation using Nystrom Features, so we decided to use it as a reference for the hyperparameters. To sum up, we decided to use:
Before executing the complete training, we would like to make a few notes.
- We use the default KFold of GridSearchCV, since we are asked to do 5-Fold CV and this is the default value for GridSearchCV.
- We save the results with Pickle, so we created a save object and load object function (available in functions.py).
- We created a function (also in functions.py) called grid_search, which is given the train set, the classifiers and the parameter space and performs a single (of the 10 desired) grid search in the parameter space. We call it $10$ times and save the results.

X,y = datasets.load_digits(n_class=10, return_X_y = True)
# Number of examples in the digits dataset.
n_samples = len(X)
# Scale pixel values to [0, 1] (original range is 0..16) and center
# each feature.
X = X / 16.0
X -= X.mean(axis=0)
# Pre-define hyperparameter vectors shared by all grid searches.
gammas = np.array([2**i for i in range(-8, 3)])  # RBF gamma grid (powers of 2)
length_scale = 1.0 / gammas  # exponential-kernel length scales
n_features = [10, 20, 50, 100, 300]  # numbers of sampled features
C = np.array([2**i for i in range(-6, 6)])  # SVM regularization grid
# Declare the six classifiers to compare, as (name, pipeline) pairs.
classifiers = [
    # 1. Non-linear SVM + RBF kernel [C, gamma]
    ('Nonlinear SVM',
     Pipeline([('svc_rbf', svm.SVC(kernel=RBF()))])),
    # 2. Linear SVM + RBF random features [C, gamma, n_features]
    ('Linear SVM + RBF random features',
     Pipeline([('rf_rbf', ka.RandomFeaturesSamplerRBF()),
               ('svc_linear', svm.SVC(kernel='linear'))])),
    # 3. Linear SVM + RBF Nyström features [C, gamma, n_features]
    ('Linear SVM + RBF Nyström features',
     Pipeline([('nyst_rbf_sampler', ka.NystroemFeaturesSampler(kernel=RBF())),
               ('svc_linear', svm.SVC(kernel='linear'))])),
    # 4. Non-linear SVM + exponential kernel [C, length_scale]
    ('Non-linear SVM + exponential kernel',
     Pipeline([('svc_exponential',
                svm.SVC(kernel=kml.exponential_kernel))])),
    # 5. Linear SVM + exponential random features [C, length_scale, n_features]
    ('Linear SVM + exponential random features',
     Pipeline([('exponential_rf', ka.RandomFeaturesSamplerExp()),
               ('svc_linear', svm.SVC(kernel='linear'))])),
    # 6. Linear SVM + exponential Nyström features [C, length_scale, n_features]
    # NOTE(review): this Nyström sampler is built with an RBF() kernel even
    # though the model is labelled "exponential" — confirm whether an
    # exponential kernel object should be passed here instead.
    ('Linear SVM + exponential Nyström features',
     Pipeline([('exponential_nystrom', ka.NystroemFeaturesSampler(kernel=RBF())),
               ('svc_linear', svm.SVC(kernel='linear'))]))
]
# Hyperparameter grids, one dict per classifier (same order as the
# `classifiers` list; keys follow sklearn's step__param convention).
param_search_space = [
    # 1. Non-linear SVM + RBF kernel [C, gamma]
    {'svc_rbf__C': C,
     'svc_rbf__gamma': gammas},
    # 2. Linear SVM + RBF random features [C, gamma, n_features]
    # Gamma is called sigma_kernel in our case
    {'svc_linear__C': C,
     'rf_rbf__sigma_kernel': gammas,
     'rf_rbf__n_features_sampled': n_features},
    # 3. Linear SVM + RBF Nyström features [C, gamma, n_features]
    # The attribute gamma is called length_scale in this case
    {'svc_linear__C': C,
     'nyst_rbf_sampler__kernel__length_scale': gammas,
     'nyst_rbf_sampler__n_features_sampled': n_features},
    # 4. Non-linear SVM + exponential kernel [C, length_scale]
    {'svc_exponential__C': C,
     'svc_exponential__gamma': length_scale},
    # 5. Linear SVM + exponential random features [C, length_scale, n_features]
    {'svc_linear__C': C,
     'exponential_rf__length_scale': length_scale,
     'exponential_rf__n_features_sampled': n_features},
    # 6. Linear SVM + exponential Nyström features [C, length_scale, n_features]
    {'svc_linear__C': C,
     'exponential_nystrom__kernel__length_scale': length_scale,
     'exponential_nystrom__n_features_sampled': n_features}
]
n_trains = 10  # number of independent train/test partitions
# The full search takes about one hour; set retrain = True to rerun it.
retrain = False
if retrain:
    for it in tqdm(range(n_trains)):
        # New 80/20 split each iteration; random_state = iteration index
        # so every run uses a different (but reproducible) partition.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=it)
        # Run one grid search over all classifiers and persist the result.
        clf = fu.grid_search(X_train,
                             y_train,
                             classifiers,
                             param_search_space,
                             it,
                             False)
from collections import defaultdict

# Collect the 10 saved grid-search results, grouped by classifier name.
all_iterations = defaultdict(list)
for i in range(n_trains):
    clf = fu.load_object("results/iteration-{}.p".format(i))
    for key in clf.keys():
        all_iterations[key].append(clf[key])

# Report, per classifier, the average best CV score over the 10 runs and
# the hyperparameters of the single best run.
for key in all_iterations.keys():
    models = all_iterations[key]
    avg_best_score = np.mean([model.best_score_ for model in models])
    idx_best = fu.find_best_model(models)
    print(key)
    print("Average best score: {}".format(avg_best_score))
    fu.print_best_param_score(models[idx_best])
Nonlinear SVM Average best score: 0.9890730255516841 - C : 1.0 - gamma : 0.00390625 Mean cross-validated score of the best_estimator Train: 0.9916448896631824 --- Linear SVM + RBF random features Average best score: 0.9887937959736739 - n_features_sampled : 300 - sigma_kernel : 2.0 - C : 16.0 Mean cross-validated score of the best_estimator Train: 0.9909601238869532 --- Linear SVM + RBF Nyström features Average best score: 0.9900471835075493 - length_scale : 2.0 - n_features_sampled : 300 - C : 32.0 Mean cross-validated score of the best_estimator Train: 0.9923441734417343 --- Non-linear SVM + exponential kernel Average best score: 0.9679178765001936 - C : 1.0 - gamma : 256.0 Mean cross-validated score of the best_estimator Train: 0.9735554587688734 --- Linear SVM + exponential random features Average best score: 0.9782171893147504 - length_scale : 16.0 - n_features_sampled : 300 - C : 8.0 Mean cross-validated score of the best_estimator Train: 0.9805216802168021 --- Linear SVM + exponential Nyström features Average best score: 0.9898395760743322 - length_scale : 2.0 - n_features_sampled : 300 - C : 32.0 Mean cross-validated score of the best_estimator Train: 0.9916473093302361 ---
It has to be remarked that GridSearchCV retrains in the whole train set a model with the best hyperparameter configuration found, this covers the second point required for the report.
We have found the best hyperparameters for each of the models. Now, our interest is to find how the number of sampled random features affects the final score of our model. To obtain this, we will fix the best hyperparameters (other than n_sampled_features and we will retrain with these fixed hyperparameters, moving the number of features.
Important addition.- In the hyperparameter first setting, we only tested small values for the n_sampled_features parameter, stating that the results in our followed documentation (Chen, Kai, Rongchun Li, Yong Dou, Zhengfa Liang, 2017) are good enough using small numbers. We will check this now in our case, adding bigger possibilities to the n_sampled_features parameter.
# Re-run the search with the per-model best hyperparameters fixed,
# varying only the number of sampled features (now including larger
# values up to 2000).
n_features = [10, 20, 50, 100, 300, 500, 1000, 2000]
param_search_space_n_features = [
    # 1. Non-linear SVM + RBF kernel [C, gamma]
    {'svc_rbf__C': [1.0],
     'svc_rbf__gamma': [0.00390625]},
    # 2. Linear SVM + RBF random features [C, gamma, n_features]
    # Gamma is called sigma_kernel in our case
    {'svc_linear__C': [16.0],
     'rf_rbf__sigma_kernel': [2.0],
     'rf_rbf__n_features_sampled': n_features},
    # 3. Linear SVM + RBF Nyström features [C, gamma, n_features]
    # The attribute gamma is called length_scale in this case
    {'svc_linear__C': [32.0],
     'nyst_rbf_sampler__kernel__length_scale': [2.0],
     'nyst_rbf_sampler__n_features_sampled': n_features},
    # 4. Non-linear SVM + exponential kernel [C, length_scale]
    {'svc_exponential__C': [256.0],
     'svc_exponential__gamma': [1.0]},
    # 5. Linear SVM + exponential random features [C, length_scale, n_features]
    {'svc_linear__C': [8.0],
     'exponential_rf__length_scale': [16.0],
     'exponential_rf__n_features_sampled': n_features},
    # 6. Linear SVM + exponential Nyström features [C, length_scale, n_features]
    {'svc_linear__C': [32.0],
     'exponential_nystrom__kernel__length_scale': [2.0],
     'exponential_nystrom__n_features_sampled': n_features}
]
n_trains = 10  # same 10 partitions as before
retrain = False  # set to True to rerun the (long) search
if retrain:
    for it in tqdm(range(n_trains)):
        # Same 80/20 splits as before (random_state = iteration index).
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=it)
        # Grid search restricted to the fixed-optimal hyperparameters,
        # varying only n_features_sampled.
        clf = fu.grid_search(X_train,
                             y_train,
                             classifiers,
                             param_search_space_n_features,
                             it,
                             False)
We load the saved GridSearch results
# Load the saved per-iteration grid searches that used the fixed
# optimal hyperparameters, grouped by classifier name.
optimal = defaultdict(list)
for i in range(n_trains):
    clf = fu.load_object("results-optimal/iteration-{}.p".format(i))
    for key in clf.keys():
        optimal[key].append(clf[key])
We now create this cell that gets the results in a dictionary so that we are able to plot them later. We have to distinguish between the case where the model has n_sampled_features as a hyperparameter and the case where the model does not have it. In the second case, we set the Cross Validation error to constant.
# Extract, per classifier, the curves (score / std / fit time) as a
# function of the number of sampled features.
train_scores = defaultdict(str)
train_stds = defaultdict(str)
cv_scores = defaultdict(str)
cv_stds = defaultdict(str)
times = defaultdict(str)
for key in optimal.keys():
    models = optimal[key]
    for model in models:
        # Check whether the model exposes an n_features_sampled parameter.
        # NOTE(review): this always inspects models[0], not the current
        # `model` of the loop — presumably all iterations share the same
        # parameter names; confirm.
        has_n_features = False
        params = models[0].best_params_.keys()
        for p in params:
            if 'n_features_sampled' in p:
                has_n_features = True
                break
        # NOTE(review): the result lists are re-initialized on every pass
        # of the inner loop, so only the last model of each key survives;
        # the plotting code below indexes [0] and relies on this.
        train_scores[key] = []
        cv_scores[key] = []
        train_stds[key] = []
        cv_stds[key] = []
        times[key] = []
        # Models without the n_features hyperparameter get flat curves so
        # they can be drawn on the same axes.
        if not has_n_features:
            idx = fu.find_best_model(models)
            train_scores[key].append(np.repeat(models[idx].best_score_, len(n_features)))
            train_stds[key].append(np.repeat(0.0, len(n_features)))
            cv_scores[key].append(np.repeat(models[idx].best_score_, len(n_features)))
            cv_stds[key].append(np.repeat(0.0, len(n_features)))
            times[key].append(np.repeat(model.cv_results_['mean_fit_time'], len(n_features)))
        else:
            train_scores[key].append(model.best_score_)
            train_stds[key].append(np.repeat(0.0, len(n_features)))
            cv_scores[key].append(model.cv_results_['mean_test_score'])
            cv_stds[key].append(model.cv_results_['std_test_score'])
            times[key].append(model.cv_results_['mean_fit_time'])
We now plot the mean_cross_validation_error, filling between $\pm 2* \text{std cv score}$.
# Plot the mean CV error vs number of random features, with a band of
# +/- 2 standard deviations of the CV score.
fig = plt.figure(figsize=(15, 9))
ax = fig.add_subplot()
for model in cv_scores.keys():
    ax.plot(n_features, 1 - cv_scores[model][0], label=model)
    ax.fill_between(n_features,
                    1 - cv_scores[model][0] + 2.0 * cv_stds[model][0],
                    1 - cv_scores[model][0] - 2.0 * cv_stds[model][0],
                    alpha=0.3)
ax.set_xlabel("Random Features")
ax.set_ylabel("Error")
_ = ax.set_title("CV Errors for each model")
_ = ax.legend()
_ = plt.show()
In the above figure, we can appreciate how our preliminary statement about the number of features seems correct in the validation set. As we stated, when the number of features is higher than approximately $500$, the errors of all the classifiers in the cross validation step have converged to a very similar value.
Now, let us focus on the first components to see with more detail how the models perform individually:
# Zoom into the first few feature counts to compare the models in the
# small-n_features regime.
fig = plt.figure(figsize=(15, 9))
ax = fig.add_subplot()
plot_n_features = 4  # show only the first four grid points
for model in cv_scores.keys():
    ax.plot(n_features[0:plot_n_features], 1 - cv_scores[model][0][0:plot_n_features], label=model)
    ax.fill_between(n_features[0:plot_n_features],
                    1 - cv_scores[model][0][0:plot_n_features] + 2.0 * cv_stds[model][0][0:plot_n_features],
                    1 - cv_scores[model][0][0:plot_n_features] - 2.0 * cv_stds[model][0][0:plot_n_features],
                    alpha=0.3)
ax.set_xlabel("Random Features")
ax.set_ylabel("Error")
_ = ax.set_title("CV Errors for each model")
_ = ax.legend()
_ = plt.show()
In the cross validation process, we can conclude that:
Non Linear SVM + exponential Kernel performs just a little worse than the Non Linear SVM + RBF kernel.

Now, let us see how the train time is affected.
# Mean fit time vs number of random features (first four grid points).
fig = plt.figure(figsize=(15, 9))
ax = fig.add_subplot()
for model in cv_scores.keys():
    ax.plot(n_features[0:4], times[model][0][0:4], label=model)
ax.set_xlabel("Random Features")
ax.set_ylabel("Fit Time")
_ = ax.set_title("Fit Train Times for each model")
_ = ax.legend()
_ = plt.show()
As we can see, the methods using random features take less time to fit, achieving very good results on cross validation, so we can say that we are obtaining our objective of maintaining the accuracy while reducing the time training by approximating the kernel function.
[1] : Claesen, Marc, en Bart De Moor. “Hyperparameter Search in Machine Learning”. arXiv [cs.LG], 2015. arXiv. http://arxiv.org/abs/1502.02127.
[2] : Chen, Kai, Rongchun Li, Yong Dou, Zhengfa Liang, en Qi Lv. “Ranking support vector machine with kernel approximation”. Computational intelligence and neuroscience 2017 (2017).
[3] : Bergstra, James, en Yoshua Bengio. “Random Search for Hyper-Parameter Optimization”. Journal of Machine Learning Research 13, no 10 (2012): 281–305. http://jmlr.org/papers/v13/bergstra12a.html.